In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
In [23]:
stop = set(stopwords.words('english'))
path = "sentiment_analysis/data/"
train_data_path ='sentiment_analysis/data/training_data/train.csv'

def readData(path):
    """Load the phrase dictionary and sentiment labels and join them on phrase id.

    Both source files are pipe-delimited but contain no tabs, so
    ``pd.read_table`` reads each as a single column (whose name is the
    pipe-joined header line); that column is then split on '|' into
    proper columns.

    Parameters
    ----------
    path : str
        Directory (with trailing separator) containing 'dictionary.txt'
        and 'sentiment_labels.txt'.

    Returns
    -------
    pandas.DataFrame with columns Phrase, phrase_ids, sentiment_values
    (all string-typed), one row per phrase that has a sentiment score.
    """
    # Phrase dictionary: header line is "!|0", rows are "<phrase>|<id>".
    raw_dict = pd.read_table(path + 'dictionary.txt')
    phrases = (raw_dict['!|0']
               .str.split('|', expand=True)
               .rename(columns={0: 'Phrase', 1: 'phrase_ids'}))

    # Sentiment labels: rows are "<id>|<score in [0, 1]>".
    raw_labels = pd.read_table(path + 'sentiment_labels.txt')
    sentiments = (raw_labels['phrase ids|sentiment values']
                  .str.split('|', expand=True)
                  .rename(columns={0: 'phrase_ids', 1: 'sentiment_values'}))

    # Inner join: keep only phrases that have a label.
    return phrases.merge(sentiments, how='inner', on='phrase_ids')
    
df_all_prepared = readData(path)
training_data = pd.read_csv(train_data_path, encoding='iso-8859-1')
In [7]:
training_data.head()
Out[7]:
index_orig Phrase phrase_ids sentiment_values
0 0 ! 0 0.50000
1 1 ! ' 22935 0.52778
2 2 ! '' 18235 0.50000
3 5 ! Brilliant ! 40532 0.93056
4 6 ! Brilliant ! ' 22937 1.00000
In [8]:
# list(training_data['Phrase'][i] if  0.1 < training_data['sentiment_values'][i] < .2 else 1 for i in range(len(training_data)))
In [3]:
fig,ax = plt.subplots(1,1)
training_data.hist(column = 'sentiment_values', ax = ax)
ax.set_title('Number of entries classified by sentiment analysis')
ax.set_xticks([i/10 for i in range(0,10)])
print('Percent of entries in range [0.5, 0.6] =  %.3f %%'%(100*(sum(list(filter(lambda x: 0.5 < x < 0.6, training_data['sentiment_values'])))/len(training_data))))
Percent of entries in range [0.5, 0.6] =  9.383 %

First we read the embeddings file into a dictionary — each entry maps a word to the vector of numbers that represents it.

In [4]:
# Load GloVe embeddings: each line is a word followed by its 300-d vector.
embeddings_index = {}
with open(path + '/glovo/glove_6B/glove.6B.300d.txt') as glove_file:
    for line in glove_file:
        parts = line.split(' ')
        # First token is the word; the remaining tokens are the embedding
        # coefficients for that word.
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

print('GloVe data loaded')
GloVe data loaded
In [5]:
# example of the word representation in terms of vector
embeddings_index[','][:5]
Out[5]:
array([-0.25539 , -0.25723 ,  0.13169 , -0.042688,  0.21817 ],
      dtype=float32)
In [6]:
import re

def clearWords(lines_without_stopwords, training_data, col_name="Phrase", stop_words=None):
    """Tokenize each row of ``training_data[col_name]`` into lowercase words,
    drop stopwords, and append one list of tokens per row to
    ``lines_without_stopwords`` (mutated in place).

    Parameters
    ----------
    lines_without_stopwords : list
        Output accumulator; one list of tokens is appended per input row.
    training_data : pandas.DataFrame
        Frame holding the text column.
    col_name : str
        Name of the text column to tokenize.
    stop_words : set, optional
        Stopword set to filter against. Defaults to the module-level NLTK
        English stopword set ``stop`` (backward compatible with the old
        behavior).
    """
    if stop_words is None:
        stop_words = stop  # module-level NLTK English stopword set
    for text in training_data[col_name].values:
        # \w+ keeps alphanumeric runs only (punctuation is discarded).
        tokens = re.findall(r'(?:\w+)', text.lower(), flags=re.UNICODE)
        lines_without_stopwords.append(
            [word for word in tokens if word not in stop_words])
    
lines_without_stopwords=[]
clearWords(lines_without_stopwords, training_data)
In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(lines_without_stopwords)
sequences = tokenizer.texts_to_sequences(lines_without_stopwords) # set numbers to the words

word_index = tokenizer.word_index # create index for each word
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # zero matrix

labels = to_categorical(np.asarray([round(i*10) if round(i*10) != 10 else 9 for i in training_data['sentiment_values']]), num_classes=10, dtype='float32')
print(data.shape)
print(labels.shape)
Using TensorFlow backend.
Found 17927 unique tokens.
(191646, 100)
(191646, 10)

Build the embedding matrix that maps each word index to its GloVe vector.

In [8]:
# prepare embedding matrix 
from keras.layers import Embedding
from keras.initializers import Constant

## EMBEDDING_DIM =  ## seems to need to match the embeddings_index dimension
# Infer the embedding width (300 here) from any loaded GloVe vector.
EMBEDDING_DIM = embeddings_index.get('a').shape[0]
# +1 because Keras word indices start at 1 (index 0 is reserved for padding).
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index keeps every token seen, but only the MAX_NUM_WORDS most
    # frequent ones fit in the matrix — skip the rest.
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
In [9]:
print(embedding_matrix.shape)
plt.plot(embedding_matrix[word_index.get('probably')]) # 0.5 - sent analysis - blue
plt.plot(embedding_matrix[word_index.get('best')]) # 0.9-1 - sent analysis - orange
# plt.plot(embedding_matrix[word_index.get('invited')]) # 0.1-0.2 - sent analysis - green
plt.title('example vectors')
(1001, 300)
Out[9]:
Text(0.5, 1.0, 'example vectors')

Here we can verify that the rows of the embedding matrix match the GloVe vectors they were copied from.

In [16]:
# print(word_index.get('invited'))
# print(embeddings_index['invited'][:5])
# print(embedding_matrix[6128][:5])
In [10]:
## To create and visualize a model

from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout, LSTM
In [18]:
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=100, trainable=False))

model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(10, activation='softmax'))
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 100, 300)          300300    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_1 (Dense)              (None, 512)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                5130      
=================================================================
Total params: 876,310
Trainable params: 576,010
Non-trainable params: 300,300
_________________________________________________________________
None
In [20]:
print(labels.shape)
model.fit(data, np.array(labels), validation_split=0.1, epochs = 1)
(191646, 10)
Train on 172481 samples, validate on 19165 samples
Epoch 1/1
172481/172481 [==============================] - 4134s 24ms/step - loss: 1.7426 - accuracy: 0.3700 - val_loss: 1.7095 - val_accuracy: 0.3737
Out[20]:
<keras.callbacks.callbacks.History at 0x1a247d8210>
In [21]:
from sklearn.manifold import TSNE
## Get weights
embds = model.layers[0].get_weights()[0]
## Plotting function
## Visualize words in two dimensions 
tsne_embds = TSNE(n_components=2).fit_transform(embds)

plt.plot(tsne_embds[:,0],tsne_embds[:,1],'.')
Out[21]:
[<matplotlib.lines.Line2D at 0x1a5d6e4a50>]
In [22]:
# model.save(path + 'my_model.h5')

The following steps do not require retraining the model:

  • load the model
In [24]:
from keras.models import load_model
model = load_model(path + 'my_model.h5')

Sentiment analysis per essay

We designed the model to produce a sentiment score between 0 and 1, with 0 being very negative and 1 very positive. This was done by building a multi-class classification model with 10 classes — one class for each decile.

In [25]:
def clearByDividerWords(lines_without_stopwords, training_data, div, col_name="Phrase", stop_words=None):
    """Split each row of ``training_data[col_name]`` on ``div``, tokenize every
    part into lowercase words without stopwords, and append one token list per
    part to ``lines_without_stopwords`` (mutated in place).

    BUGS FIXED from the original cell:
      * ``div`` (no default) appeared after ``col_name="Phrase"`` — a
        SyntaxError; parameter order now matches the later redefinition below.
      * ``re.findall`` referenced an undefined name ``sent`` instead of the
        current part.
      * results were appended to the undefined ``div_part_without_stopwords``
        instead of the output argument.

    Parameters
    ----------
    lines_without_stopwords : list
        Output accumulator.
    training_data : pandas.DataFrame
        Frame holding the text column.
    div : str
        Divider to split each row on (e.g. '.' for sentences).
    col_name : str
        Name of the text column.
    stop_words : set, optional
        Stopword set; defaults to the module-level NLTK set ``stop``.
    """
    if stop_words is None:
        stop_words = stop  # module-level NLTK English stopword set
    for line in training_data[col_name].values:
        for div_part in line.split(div):
            words = re.findall(r'(?:\w+)', div_part.lower(), flags=re.UNICODE)
            lines_without_stopwords.append(
                [w for w in words if w not in stop_words])

def generate_essay_data_sentiment_matrix(filename, data_dir='essayVSreview/'):
    """Read an essay CSV and return the model's per-essay sentiment predictions.

    Parameters
    ----------
    filename : str
        Tab-separated CSV file name inside ``data_dir`` with a
        'ReviewText' column.
    data_dir : str, optional
        Directory containing the essay files (previously hard-coded).

    Returns
    -------
    Prediction array from the global ``model`` — presumably shape
    (n_essays, 10), one softmax score per sentiment decile, given the
    10-class model built above; verify after reloading from disk.

    Note: the original also called ``.head()`` and discarded the result —
    a no-op inside a function; removed.
    """
    essays = pd.read_csv(data_dir + filename, sep='\t')
    essays_without_stops = []

    # Tokenize the whole essay text, drop stopwords, then map words to the
    # training tokenizer's indices and pad to the training sequence length.
    clearWords(essays_without_stops, essays, 'ReviewText')
    sequences = tokenizer.texts_to_sequences(essays_without_stops)
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # zero matrix
    return model.predict(padded)
In [358]:
pred_essay_gay_merriage = generate_essay_data_sentiment_matrix('GayMarriage_400.csv')
pred_essay_gun_control = generate_essay_data_sentiment_matrix('GunControl_400.csv')
pred_essay_review_amt = generate_essay_data_sentiment_matrix('ReviewAMT_500_t.csv')
In [413]:
# sentiment analysis of each essay in gay_merriage
def show_each_essay_sentiment(pred, file, task, task_desc, n_rows=1600, n_tasks=4):
    """Plot the 10-class sentiment curve of every essay belonging to one task.

    Rows of ``pred`` are interleaved by task (Copy_1, Copy_2, True, Fake),
    so rows ``task, task + n_tasks, ...`` belong to the requested task.

    Parameters
    ----------
    pred : array-like
        Per-essay prediction rows (one sentiment distribution per essay).
    file : str
        Data-set name used in the plot title.
    task : int
        Task offset (0..n_tasks-1) selecting which interleaved rows to plot.
    task_desc : str
        Human-readable task label for the title.
    n_rows : int, optional
        Total number of rows to scan (was hard-coded to 1600).
    n_tasks : int, optional
        Interleaving stride between rows of the same task (was hard-coded to 4).
    """
    for i in range(task, n_rows, n_tasks):
        plt.plot(pred[i])
    plt.title('sentiment analysis of each essay in ' + file + ": " + task_desc)
    plt.xlabel('sentiment class')
    plt.ylabel('power value')
    plt.show()
    
copy1, copy2, true, fake = 0, 1, 2, 3
copy_1_desc, copy_2_desc, true_desc, fake_desc = "Copy_1", "Copy2", "True Essay", "Fake Essay"
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', true, true_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', fake, fake_desc)

show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', true, true_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', fake, fake_desc)

show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', true, true_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', fake, fake_desc)
Conclusion:

For each file, positive and negative sentiments occur about equally often.

For review essays it is much easier to write single-minded sentences, so we see fewer ambiguous (middle-of-the-scale) values in the plots.

Sentiment analysis per sentence

In [361]:
def clearByDividerWords (lines_without_stopwords, training_data, div, col_name="Phrase", stop_words=None):
    """Split each row of text on ``div`` (e.g. '.' for sentences), tokenize
    every part into lowercase words without stopwords, and append one
    list-of-parts per row to ``lines_without_stopwords`` (mutated in place).

    Output shape: lines_without_stopwords[row][part] -> list of tokens.

    BUG FIXED: the original lower-cased with
    ``" ".join([i.lower() for i in div_part])``, which iterates the
    *characters* of the part and space-joins them, so ``re.findall(r'\\w+')``
    produced single letters instead of words. The part is now simply
    lower-cased.

    Parameters
    ----------
    stop_words : set, optional
        Stopword set; defaults to the module-level NLTK set ``stop``.
    """
    if stop_words is None:
        stop_words = stop  # module-level NLTK English stopword set
    # by row
    for line in training_data[col_name].values:
        parts_without_stopwords = []
        for div_part in line.split(div):
            words = re.findall(r'(?:\w+)', div_part.lower(), flags=re.UNICODE)
            parts_without_stopwords.append(
                [w for w in words if w not in stop_words])
        lines_without_stopwords.append(parts_without_stopwords)
            

def generate_sent_data_sentiment_matrix(filename):
    """Per-sentence sentiment predictions for every essay in ``filename``.

    Returns
    -------
    list — one prediction array per essay (one row per sentence of that
    essay, one column per sentiment decile of the 10-class model).

    BUG FIXED: the original passed the global ``gay_merriage`` frame to
    ``clearByDividerWords`` instead of the file it had just read, so all
    three data sets produced gay-marriage predictions.
    """
    project_data_path = 'essayVSreview/'
    essays = pd.read_csv(project_data_path + filename, sep='\t')
    essays_without_stops = []

    # Split each essay into sentences on '.', then tokenize each sentence.
    clearByDividerWords(essays_without_stops, essays, '.', 'ReviewText')

    pred_essay = []
    for sent_without_stops in essays_without_stops:
        sequences_sent = tokenizer.texts_to_sequences(sent_without_stops)
        sent_matrix = pad_sequences(sequences_sent, maxlen=MAX_SEQUENCE_LENGTH)  # zero matrix
        pred_essay.append(model.predict(sent_matrix))
    return pred_essay
In [362]:
pred_sent_gay_merriage = generate_sent_data_sentiment_matrix('GayMarriage_400.csv')
pred_sent_gun_control = generate_sent_data_sentiment_matrix('GunControl_400.csv')
pred_sent_review_amt = generate_sent_data_sentiment_matrix('ReviewAMT_500_t.csv')

Example of sentence sentiment analysis

In [412]:
# sentiment analysis of each sentance in review_amt first essay
plt.figure()
for i in range(len(pred_sent_review_amt[:5])):
    for j in range(len(pred_sent_review_amt[i])):
        plt.plot(pred_sent_review_amt[i][j])
    plt.title('sentiment analysis of each sentance in review_amt first essay')
    plt.xlabel('sentiment class')
    plt.ylabel('power value')
    plt.show()

Conclusions:

Each sentence consists of a large number of neutral words.

KeyStroke Analysis

In [118]:
from collections import defaultdict
proj_data_path = 'essayVSreview/'

gay_merriage = pd.DataFrame(pd.read_csv(proj_data_path +"GayMarriage_400.csv", sep='\t'))
gun_control = pd.DataFrame(pd.read_csv(proj_data_path + "GunControl_400.csv", sep='\t'))
review_amt = pd.DataFrame(pd.read_csv(proj_data_path +"ReviewAMT_500_t.csv", sep='\t'))
files = [gay_merriage, gun_control, review_amt]
files_names = ['gay_merriage', 'gun_control', 'review_amt']
In [485]:
SPACE = 32
PERIOD = 190
def find_average_writing_pause(file, filename, divider, task):
    '''
    Histogram, per matching essay row, the typing pauses that end on the
    keycode `divider` (e.g. SPACE for word boundaries, PERIOD for sentences).

    A pause is the difference between consecutive event timestamps; only
    pauses strictly between 0 and 250 ms are kept, bucketed to the nearest
    10 ms. `filename` is accepted for interface compatibility but unused.

    return: list(dict(hundrets: num_of_cases)) — one dict per row whose
    Task equals `task`
    '''
    res = []
    idx = file.index
    for row_id in range(idx.start, idx.stop):
        if file['Task'][row_id] != task:
            continue
        # Each ';'-separated entry looks like "timestamp event keycode ...";
        # entries with fewer than 3 whitespace tokens are skipped.
        events = [meta.split() for meta in file['ReviewMeta'][row_id].split(';')]
        events = [e for e in events if len(e) >= 3]
        timestamps = [int(e[0]) for e in events]
        keycodes = [int(e[2]) for e in events]

        counts = defaultdict(int)
        # pauses between words
        # (dropping the divider check would give the letter distribution)
        for prev_ts, cur_ts, cur_key in zip(timestamps, timestamps[1:], keycodes[1:]):
            if cur_key != divider:
                continue
            pause = cur_ts - prev_ts
            if 0 < pause < 250:
                counts[round(pause / 10) * 10] += 1
        res.append(counts)
    return res
In [491]:
def draw_plot(divider, task, title):
    """Draw one subplot per data file showing, for every matching essay,
    the distribution of typing pauses that end on `divider`.

    Uses the module-level `files` / `files_names` lists. For the third
    file (the review data set) the task label is renamed
    'True/Fake Essay' -> 'True/Fake Review' to match that file's labels.
    """
    # plot of the biggest number of cases
    fig, axs = plt.subplots(3, figsize=[18.5, 10.5])
    fig.suptitle(title)
    for file_idx, data_file in enumerate(files):
        if file_idx == 2 and task == 'True Essay':
            task = 'True Review'
        if file_idx == 2 and task == 'Fake Essay':
            task = 'Fake Review'
        pauses_per_essay = find_average_writing_pause(data_file, files_names[file_idx], divider, task)
        for pause_counts in pauses_per_essay:
            try:
                buckets, cases = zip(*sorted(pause_counts.items()))
                axs[file_idx].plot(buckets, cases)
                axs[file_idx].set(ylabel='number of pause cases in ' + files_names[file_idx])
            except ValueError:
                # An essay with no qualifying pauses yields an empty dict,
                # and zip(*[]) raises ValueError — skip it.
                continue

    for ax in axs.flat:
        ax.set(xlabel='timestamp pauses btw words')
In [492]:
# True essays words distribution
draw_plot(SPACE, 'True Essay', 'Data word-pauses distribution in TRUE essay:')
# Fake essays words distribution
draw_plot(SPACE, 'Fake Essay', 'Data word-pauses distribution in FAKE essay:')
# Copy True essays words distribution
draw_plot(SPACE, 'Copy_1', 'Data word-pauses distribution in Copy of TRUE essay:')
# Copy Fake essays words distribution
draw_plot(SPACE, 'Copy_1', 'Data word-pauses distribution in Copy of FAKE essay:')
# TODO: draw density function
In [493]:
# True essays sentance distribution
draw_plot(PERIOD, 'True Essay', 'Data sentance-pauses distribution in TRUE essay:')
# Fake essays sentance distribution
draw_plot(PERIOD, 'Fake Essay', 'Data sentance-pauses distribution in FAKE essay:')
# Copy True essays sentance distribution
draw_plot(PERIOD, 'Copy_1', 'Data sentance-pauses distribution in Copy of TRUE essay:')
# Copy Fake essays sentance distribution
draw_plot(PERIOD, 'Copy_2', 'Data sentance-pauses distribution in Copy of FAKE essay:')
# TODO: draw density function

As we can see, the most common pauses are:

  • between letters: about 100 milliseconds
  • between words: from 0 to 150 ms
  • between sentences: from 25 to 125 ms
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

TRASH is below

In [296]:
PERIOD = '190'
ARROWS = [37, 38, 39, 40]
BACKSPACE = 8

SENT_AV_TOP_LIM = 125
SENT_AV_BOTTOM_LIM = 25
def analise_sentance_keystroke(file):
    '''
    Per-sentence keystroke analysis: walks each essay row's ';'-separated
    ReviewMeta events, treats the events up to the next PERIOD keypress as
    one "sentence", and accumulates mouse-selection size, arrow-key and
    backspace counts plus inter-key pause outliers per sentence.

    list of dictionaries with each file (by sentance) analysis
    return: list(dict('trueEssay': dict('MouseUp', 'Freq'...), 'falseEssay': dict('MouseUp', 'Freq'...)))

    NOTE(review): this lives in the "TRASH is below" scratch section and has
    several apparent defects, left untouched here:
      * reads the global `gay_merriage` frame instead of the `file` argument;
      * the row range is hard-coded to 1600 regardless of `file` length;
      * the while-guard checks `len(...) >= 2` but then indexes `[2]`
        (IndexError on 2-token entries), and `i_end` can run past the list;
      * `i_end` and `last_timestamp` persist across rows, so sentence
        windows and pause diffs leak between essays — confirm intent.
    '''
    f_indx = file.index
    sentence_keystroke_true = []
    sentence_keystroke_false = []
    each_sent_res = []
    last_timestamp = 0
    i_end = 0
    
    # each row-essay in a file 
    for id in range(f_indx.start, 1600):
        key_down_freq = []

        # NOTE(review): uses the global gay_merriage, not `file`.
        keystroke_meta_data = gay_merriage['ReviewMeta'][id].split(';')
        i_start = i_end
        if i_start >= len(keystroke_meta_data):
            continue
        row_meta = keystroke_meta_data[i_start]
        
        # True-authored rows (originals and their copies) go to one bucket,
        # everything else to the "false" bucket.
        if file['Task'][id] in ['Copy_1', 'True Essay']:
            sent_keystroke_data = sentence_keystroke_true
        else:
            sent_keystroke_data = sentence_keystroke_false
        
        # Advance i_end to the next PERIOD keypress — that event window
        # [i_start:i_end] is treated as one sentence.
        while len(row_meta.split()) >= 2 and row_meta.split()[2] != PERIOD:
            i_end += 1
            row_meta = keystroke_meta_data[i_end]
        for sent_event in keystroke_meta_data[i_start:i_end]:
            # Fresh per-event dict; appended once per event below.
            sent_keystroke_data_dict = defaultdict(int)
            meta_sent_parser = sent_event.split()
            event = meta_sent_parser[1]

            # collect KeyStroke date out of the sentence
            cur_timestamp = int(meta_sent_parser[0])
            timestamp_diff = cur_timestamp - last_timestamp
            
            # choose only 'special' data
            # (keep only pauses outside the [SENT_AV_BOTTOM_LIM, SENT_AV_TOP_LIM]
            # "normal" band — i.e. unusually short or long gaps)
            if timestamp_diff > SENT_AV_TOP_LIM or \
                timestamp_diff < SENT_AV_BOTTOM_LIM:
                key_down_freq.append(timestamp_diff)
            last_timestamp = cur_timestamp
            
            if event == 'MouseUp':
                # MouseUp rows carry a selection span in tokens [2] and [3].
                sent_keystroke_data_dict['mouse_select'] += int(meta_sent_parser[3]) - int(meta_sent_parser[2])
            elif int(meta_sent_parser[2]) in ARROWS:
                sent_keystroke_data_dict['arrows'] += 1
            elif int(meta_sent_parser[2]) == BACKSPACE:
                sent_keystroke_data_dict['del'] += 1
            # All events of this sentence share the same key_down_freq list.
            sent_keystroke_data_dict['key_down_freq'] = key_down_freq
            # list of each sentance analysises
            sent_keystroke_data.append(sent_keystroke_data_dict)
        # NOTE(review): the same two accumulator lists are re-wrapped and
        # appended every row, so each entry aliases the same growing data.
        essay_res_with_list_of_sent = {'trueEssay': sentence_keystroke_true, 'falseEssay': sentence_keystroke_false}
        each_sent_res.append(essay_res_with_list_of_sent)
    return each_sent_res
In [297]:
dict_of_gayMeraige_sent_keystroke = analise_sentance_keystroke(files[0])
dict_of_gunControl_sent_keystroke = analise_sentance_keystroke(files[1])
dict_of_reviewAMT_sent_keystroke = analise_sentance_keystroke(files[2])
In [422]:
len(dict_of_gayMeraige_sent_keystroke)
Out[422]:
35
In [459]:
# NOTE(review): TOP (0) is smaller than BOTTOM (150), so the filter below
# (`diff > TOP or diff < BOTTOM`) is true for every positive diff —
# presumably the two constants are swapped; confirm intended band.
WORDS_AV_TOP_LIM = 0
WORDS_AV_BOTTOM_LIM = 150

def analise_essay_keystroke(file):
    """Per-essay keystroke analysis: accumulates mouse-selection size,
    arrow-key and backspace counts, and inter-key pause outliers into one
    dict for true-authored rows and one for the rest.

    NOTE(review): scratch ("TRASH") code with apparent defects, left as-is:
      * `for i in timestamp_diff:` iterates an int — raises TypeError when
        reached; the bucketing loop appears to belong elsewhere;
      * `key_down_freq` and `last_timestamp` are shared across all rows;
      * `len(...) < 2` guards an access to index [2] (IndexError possible).
    """
    file_ind = file.index
    dict_true = defaultdict(int)
    dict_false = defaultdict(int)
    key_down_freq = []
    last_timestamp = 0
    res= []
    
    for id in range(file_ind.start, file_ind.stop, file_ind.step):
        # Route this row's counts by authorship task.
        dict_to_write = dict_true if file['Task'][id] in ['Copy_1', 'True Essay'] else dict_false
        keystroke_meta_data = file['ReviewMeta'][id].split(';')
        
        for i in range(len(keystroke_meta_data)):
            meta_essay_parser = keystroke_meta_data[i].split()
            if (len(meta_essay_parser) < 2):
                continue
            event = meta_essay_parser[1]
            
            # collect KeyStroke date out of the sentence
            cur_timestamp = int(meta_essay_parser[0])
            timestamp_diff = cur_timestamp - last_timestamp
            
            # choose only 'special' data
            if timestamp_diff > WORDS_AV_TOP_LIM or \
                timestamp_diff < WORDS_AV_BOTTOM_LIM:
                # Negative diff (timestamps went backwards): clamp to zero.
                if timestamp_diff < 0:
                    cur_timestamp = last_timestamp
                    timestamp_diff = cur_timestamp - last_timestamp
                key_down_freq.append(timestamp_diff)
            last_timestamp = cur_timestamp
            
            if event == 'MouseUp':
                # MouseUp rows carry a selection span in tokens [2] and [3].
                dict_to_write['mouse_select'] += int(meta_essay_parser[3]) - int(meta_essay_parser[2])
            elif int(meta_essay_parser[2]) in ARROWS:
                dict_to_write['arrows'] += 1
            elif int(meta_essay_parser[2]) == BACKSPACE:
                dict_to_write['del'] += 1
            dict_to_write['key_down_freq'] = key_down_freq
            dict_ = defaultdict(int)
            # NOTE(review): timestamp_diff is an int — this loop raises
            # TypeError when executed; likely meant to iterate key_down_freq.
            for i in timestamp_diff:
                if not (0 < i < 250):
                    continue
                dict_[round(i/10) * 10] += 1
            res.append(dict_)
        res.append({'trueEssay': dict_true, 'falseEssay': dict_false})
    return res
In [480]:
# One file parsed data keystroke analysis
dict_of_gayMeriage_file_keystroke = analise_essay_keystroke(files[0])
dict_of_gunControl_file_keystroke = analise_essay_keystroke(files[1])
dict_of_reviewAMT_file_keystroke = analise_essay_keystroke(files[2])
In [483]:
key_down_freq = []

max(sorted(list(filter(lambda x: x < 1000, dict_of_reviewAMT_file_keystroke[0]['trueEssay']['key_down_freq']))))


# for i in range(1600):
#     key_down_freq = dict_of_gayMeriage_file_keystroke[i]['trueEssay']['key_down_freq']
Out[483]:
980
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

TODO:

  1. We have a prediction of the sentiment analysis in 10 classes, by sentence (1*)
  2. We have keystroke analysis by essay (2*)
  3. We have to do:
    • keystroke analysis by sentence (DONE) + combine it with (1*)
    • sentiment by word
    • (DONE) keystroke analysis by word
    • sentiment analysis by document + combine it with (2*)

Next steps:

  1. Read about SVD in sentiment analysis: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0217591
  2. Implement our personal sentiment prediction on SVD and PCA
In [ ]:
from collections import defaultdict
# NOTE(review): unfinished scratch cell (execution count blank — never run):
# the assignment below has no right-hand side (SyntaxError), and
# file['ReviewText'] is a Series, which has no .split — iterating rows
# (e.g. file['ReviewText'][i].split('.')) was presumably intended.
PERIOD = '190'
sentance_keystroke_corespondence = defaultdict(int)
file = gay_merriage

for sentence in file['ReviewText'].split('.'):
    sentance_keystroke_corespondence[sentence] = 
In [62]:
len(gay_merriage['ReviewText'][0].split('.'))
Out[62]:
7
In [ ]:
 
In [ ]: